wordbankr


wordbankr

# Attach wordbankr, open the package help index, and list the objects in the
# attached package environment. The `##` lines are the recorded ls() output.
library(wordbankr)
help(package = "wordbankr")
ls("package:wordbankr")
##  [1] "fit_aoa"                 "fit_vocab_quantiles"    
##  [3] "get_administration_data" "get_crossling_data"     
##  [5] "get_crossling_items"     "get_instrument_data"    
##  [7] "get_instruments"         "get_item_data"          
##  [9] "get_sources"             "summarise_items"

wordbankr: instruments

# Call get_instruments() with its defaults; the result auto-prints.
get_instruments()

wordbankr: sources

# Call get_sources() with its defaults; the result auto-prints.
get_sources()

wordbankr: sources

# Same call, now passing language = "English (American)".
get_sources(language = "English (American)")

wordbankr: administrations

# Request administration data with language = "English (American)" and
# form = "WS", keep it as admins_eng_ws, and print it.
admins_eng_ws <- get_administration_data(
  language = "English (American)",
  form = "WS"
)
admins_eng_ws

# Number of distinct data_id values in that table.
n_distinct(admins_eng_ws[["data_id"]])
## [1] 5520

wordbankr: administrations

# Row count per value of age.
count(admins_eng_ws, age)

wordbankr: administrations

# production against age: jittered grey points with a default geom_smooth()
# layer on top, classic theme.
ggplot(data = admins_eng_ws, mapping = aes(x = age, y = production)) +
  geom_jitter(colour = "grey", size = 0.5) +
  geom_smooth() +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  theme_classic()

wordbankr: administrations

# Request administration data with language = "Russian" (no form given),
# store it, and print it.
admins_russian <- get_administration_data(language = "Russian")
admins_russian

wordbankr: administrations

# Request administration data with form = "WS" (no language given),
# store it, and print it.
admins_ws <- get_administration_data(form = "WS")
admins_ws

wordbankr: administrations

# Request administration data with all defaults (neither language nor form
# given), print it, and count its rows. The `##` line is the recorded output.
admins <- get_administration_data()
admins
nrow(admins)
## [1] 82055

wordbankr: administrations

# Row count per (language, form) combination.
count(admins, language, form)

wordbankr: items

# Request item data with language = "English (American)" and form = "WS",
# store it, and print it.
items_eng_ws <- get_item_data(language = "English (American)", form = "WS")
items_eng_ws

wordbankr: items

# Unique values of the type column.
distinct(items_eng_ws, type)

wordbankr: items

# Unique values of the category column.
distinct(items_eng_ws, category)

wordbankr: items

# Unique values of the lexical_category column.
distinct(items_eng_ws, lexical_category)

wordbankr: items

# Request item data with all defaults (neither language nor form given),
# store it, and print it.
items <- get_item_data()
items

wordbankr: items

# Row count per (language, form) combination.
count(items, language, form)

wordbankr: data

# item_id values of the rows whose definition is "dog" or "cat".
ids <- pull(
  filter(items_eng_ws, definition %in% c("dog", "cat")),
  item_id
)

# Request instrument data for just those items.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids
)

wordbankr: data

# Request instrument data for the `ids` items, this time with
# administrations = TRUE and iteminfo = TRUE.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids,
  administrations = TRUE,
  iteminfo = TRUE
)

wordbankr: data

# Administrations with age == 24, and the item rows defined as "dog" or "cat".
twos <- filter(admins_eng_ws, age == 24)
dog_cat <- filter(items_eng_ws, definition %in% c("dog", "cat"))

# Pass those two tables as the administrations / iteminfo arguments
# instead of TRUE.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = dog_cat[["item_id"]],
  administrations = twos,
  iteminfo = dog_cat
)

wordbankr

Exercises

  1. Compute and plot median productive vocabulary size (as proportion of total words) over age in each language. Limit to WS data for children 16-30 months old (hint: left_join and facet_wrap are likely to be helpful).

  2. For English WS data, compute and plot the proportion of children that produce each word in the “toys” category at each age.

  3. (Bonus: Do the same thing as in 2 but separately for girls and boys.)

Exercise 1

# Starting point for Exercise 1: print the item table.
items 

Exercise 1

# Keep rows with form "WS" and type "word".
items %>%
  filter(form == "WS" & type == "word")

Exercise 1

# Keep rows with form "WS" and type "word", then group by language.
items %>%
  filter(form == "WS" & type == "word") %>%
  group_by(language)

Exercise 1

# Count the "word"-type WS rows per language as `words`.
items %>%
  filter(form == "WS" & type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS" & type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS" & type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS" & type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# Next ingredient: the administration table.
admins

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS" & type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# WS administrations with age between 16 and 30 inclusive.
admins %>%
  filter(form == "WS" & age >= 16 & age <= 30)

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# WS administrations with age between 16 and 30 inclusive, with the
# per-language word count attached. The join key is spelled out so the join
# does not depend on whichever column names the two tables happen to share.
admins %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  left_join(num_words, by = "language")

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# WS administrations with age between 16 and 30 inclusive, with production
# expressed as a fraction of the language's word count. The join key is
# spelled out so the join does not depend on whichever column names the two
# tables happen to share.
admins %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  left_join(num_words, by = "language") %>%
  mutate(prop_vocab = production / words)

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# WS administrations with age between 16 and 30 inclusive, production as a
# fraction of the language's word count, grouped by language and age. The
# join key is spelled out so the join does not depend on whichever column
# names the two tables happen to share.
admins %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  left_join(num_words, by = "language") %>%
  mutate(prop_vocab = production / words) %>%
  group_by(language, age)

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# Median of production / words for each (language, age) among WS
# administrations with age between 16 and 30 inclusive. The join key is
# spelled out so the join does not depend on whichever column names the two
# tables happen to share.
admins %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  left_join(num_words, by = "language") %>%
  mutate(prop_vocab = production / words) %>%
  group_by(language, age) %>%
  summarise(median_vocab = median(prop_vocab))

Exercise 1

# Per-language count of "word"-type rows on the WS form, stored as num_words.
num_words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# Median of production / words for each (language, age) among WS
# administrations with age between 16 and 30 inclusive, stored as
# vocab_summary. The join key is spelled out so the join does not depend on
# whichever column names the two tables happen to share; ungroup() keeps the
# stored table free of leftover grouping.
vocab_summary <- admins %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  left_join(num_words, by = "language") %>%
  mutate(prop_vocab = production / words) %>%
  group_by(language, age) %>%
  summarise(median_vocab = median(prop_vocab)) %>%
  ungroup()

Exercise 1

# Empty canvas: age on x, median_vocab on y.
ggplot(data = vocab_summary, mapping = aes(x = age, y = median_vocab))

Exercise 1

# One panel per language, seven panels per row.
ggplot(data = vocab_summary, mapping = aes(x = age, y = median_vocab)) +
  facet_wrap(~ language, ncol = 7)

Exercise 1

# One panel per language, seven per row, with small points.
ggplot(data = vocab_summary, mapping = aes(x = age, y = median_vocab)) +
  facet_wrap(~ language, ncol = 7) +
  geom_point(size = 0.6)

Exercise 1

# One panel per language, seven per row, small points, y axis fixed to [0, 1].
ggplot(data = vocab_summary, mapping = aes(x = age, y = median_vocab)) +
  facet_wrap(~ language, ncol = 7) +
  geom_point(size = 0.6) +
  ylim(0, 1)

Exercise 1

# One panel per language, seven per row, small points, y axis fixed to
# [0, 1], with axis labels.
ggplot(data = vocab_summary, mapping = aes(x = age, y = median_vocab)) +
  facet_wrap(~ language, ncol = 7) +
  geom_point(size = 0.6) +
  ylim(0, 1) +
  labs(x = "Age (months)", y = "Productive vocabulary size")

Exercise 1

# Final Exercise 1 plot: one panel per language (seven per row), small points,
# y axis fixed to [0, 1], axis labels, classic theme with smaller strip text.
#
# theme_classic() is a complete theme, and adding a complete theme replaces
# every theme() setting added before it. The strip.text tweak therefore has
# to come AFTER theme_classic(), otherwise it is silently discarded.
ggplot(vocab_summary, aes(x = age, y = median_vocab)) +
  facet_wrap(~language, ncol = 7) +
  geom_point(size = 0.6) +
  ylim(0, 1) +
  labs(x = "Age (months)", y = "Productive vocabulary size") +
  theme_classic() +
  theme(strip.text = element_text(size = rel(0.5)))

— {.build}

Exercise 2

# Item rows with type "word" and category "toys".
toys <- filter(items_eng_ws, type == "word" & category == "toys")

# Instrument data for those items, passing the administration table and the
# toys item table as `administrations` / `iteminfo`. `produces` is TRUE only
# when value is non-missing and equal to "produces".
toys_data <- get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = toys$item_id,
  administrations = admins_eng_ws,
  iteminfo = toys
) %>%
  mutate(produces = !is.na(value) & value == "produces")

Exercise 2

# Starting point for Exercise 2: print toys_data.
toys_data 

Exercise 2

# Group by definition and age.
group_by(toys_data, definition, age)

Exercise 2

# Share of rows with produces == TRUE within each (definition, age).
summarise(
  group_by(toys_data, definition, age),
  prop_produces = sum(produces) / n()
)

Exercise 2

# Share of rows with produces == TRUE within each (definition, age), stored
# as toys_summary. ungroup() keeps the stored table free of leftover grouping.
toys_summary <- toys_data %>%
  group_by(definition, age) %>%
  summarise(prop_produces = sum(produces) / n()) %>%
  ungroup()

Exercise 2

# Empty canvas: age on x, prop_produces on y.
ggplot(data = toys_summary, mapping = aes(x = age, y = prop_produces))

Exercise 2

# One panel per definition, six panels per row.
ggplot(data = toys_summary, mapping = aes(x = age, y = prop_produces)) +
  facet_wrap(~ definition, ncol = 6)

Exercise 2

# One panel per definition, six per row, with a default geom_smooth() layer.
ggplot(data = toys_summary, mapping = aes(x = age, y = prop_produces)) +
  facet_wrap(~ definition, ncol = 6) +
  geom_smooth()

Exercise 2

# One panel per definition, six per row, default geom_smooth() layer,
# axis labels, classic theme.
ggplot(data = toys_summary, mapping = aes(x = age, y = prop_produces)) +
  facet_wrap(~ definition, ncol = 6) +
  geom_smooth() +
  labs(x = "Age (months)", y = "Proportion of children producing") +
  theme_classic()

Exercise 3

# Starting point for Exercise 3: print toys_data.
toys_data 

Exercise 3

# Drop rows where sex is missing.
filter(toys_data, !is.na(sex))

Exercise 3

# Drop rows where sex is missing, then group by definition, age and sex.
group_by(
  filter(toys_data, !is.na(sex)),
  definition, age, sex
)

Exercise 3

# Share of rows with produces == TRUE within each (definition, age, sex),
# ignoring rows where sex is missing.
toys_data %>%
  filter(!is.na(sex)) %>%
  group_by(definition, age, sex) %>%
  summarise(
    prop_produces = sum(produces) / n()
  )

Exercise 3

# Share of rows with produces == TRUE within each (definition, age, sex),
# ignoring rows where sex is missing; stored as toys_summary_sex.
# ungroup() keeps the stored table free of leftover grouping.
toys_summary_sex <- toys_data %>%
  filter(!is.na(sex)) %>%
  group_by(definition, age, sex) %>%
  summarise(prop_produces = sum(produces) / n()) %>%
  ungroup()

Exercise 3

# Empty canvas: age on x, prop_produces on y.
ggplot(data = toys_summary_sex, mapping = aes(x = age, y = prop_produces))

Exercise 3

# One panel per definition, six panels per row.
ggplot(data = toys_summary_sex, mapping = aes(x = age, y = prop_produces)) +
  facet_wrap(~ definition, ncol = 6)

Exercise 3

# One panel per definition, six per row, one smooth per sex (coloured),
# without the confidence band.
ggplot(data = toys_summary_sex, mapping = aes(x = age, y = prop_produces)) +
  facet_wrap(~ definition, ncol = 6) +
  geom_smooth(mapping = aes(colour = sex), se = FALSE)

Exercise 3

# One panel per definition, six per row, one smooth per sex without the
# confidence band, coloured via scale_colour_ptol() with an empty legend title.
ggplot(data = toys_summary_sex, mapping = aes(x = age, y = prop_produces)) +
  facet_wrap(~ definition, ncol = 6) +
  geom_smooth(mapping = aes(colour = sex), se = FALSE) +
  scale_colour_ptol(name = "")

Exercise 3

# One panel per definition, six per row, one smooth per sex without the
# confidence band, scale_colour_ptol() colours with an empty legend title,
# axis labels, classic theme.
ggplot(data = toys_summary_sex, mapping = aes(x = age, y = prop_produces)) +
  facet_wrap(~ definition, ncol = 6) +
  geom_smooth(mapping = aes(colour = sex), se = FALSE) +
  scale_colour_ptol(name = "") +
  labs(x = "Age (months)", y = "Proportion of children producing") +
  theme_classic()

wordbankr: AoA

wordbankr: AoA

wordbankr: AoA

wordbankr: AoA

# fit_aoa() with its defaults, showing only the definition and aoa columns.
select(fit_aoa(toys_data), definition, aoa)

# Same, with method = "glmrob" and proportion = 0.8.
select(
  fit_aoa(toys_data, method = "glmrob", proportion = 0.8),
  definition, aoa
)

wordbankr: unilemmas

# Call get_crossling_items() with its defaults, then get_crossling_data()
# with uni_lemmas = "dog"; both results auto-print.
get_crossling_items()
get_crossling_data(uni_lemmas = "dog")

Mini-project!

Use data from Wordbank/childes-db to explore a question about language learning. Some ideas:

Wordbank

  • Explore the relationship between vocabulary size and grammar ability (“complexity” items).
  • Look at the composition of vocabulary – what proportion of words that children know are which lexical category – and how it changes over age.

Resources

Wordbank
wordbank.stanford.edu
github.com/langcog/wordbankr
langcog.github.io/wordbankr
mb-cdi.stanford.edu
Citation: Frank, M. C., Braginsky, M., Yurovsky, D., & Marchman, V. A. (2017). Wordbank: An open repository for developmental vocabulary data. Journal of Child Language, 44(3), 677-694.

childes-db
childes-db.stanford.edu/
github.com/langcog/childesr
childes.talkbank.org
Citation: Sanchez, A., Meylan, S. C., Braginsky, M., MacDonald, K. E., Yurovsky, D., & Frank, M. C. (2019). childes-db: A flexible and reproducible interface to the Child Language Data Exchange System. Behavior Research Methods, 1-14.

This presentation
github.com/mikabr/acq-tools
mikabr.github.io/acq-tools

Contact: mikabr@mit.edu, mcfrank@stanford.edu